import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from scipy.stats import zscore
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_curve, auc
def getAccuracy(testSet, predictions):
    """Return the percentage of positions where predictions match testSet.

    Parameters
    ----------
    testSet : sequence of ground-truth labels
    predictions : sequence of predicted labels (same length as testSet)

    Returns
    -------
    float  Match percentage in [0, 100]. Returns 0.0 for an empty
           testSet instead of raising ZeroDivisionError.
    """
    if len(testSet) == 0:
        return 0.0
    # Count element-wise matches; zip pairs actual with predicted values.
    correct = sum(1 for actual, predicted in zip(testSet, predictions)
                  if actual == predicted)
    return (correct / float(len(testSet))) * 100.0
Import part1 of dataset
# Part 1 of the dataset: car names only (CSV, one column).
p1data = pd.read_csv("Part1 - Car name.csv")
p1data.shape
(398, 1)
The given data set contains 398 rows and 1 columns
Import part 2 of data set
# Part 2 of the dataset: numeric car attributes (JSON, 8 columns).
p2data = pd.read_json('Part1 - Car-Attributes.json')
p2data.shape
(398, 8)
The given data set contains 398 rows and 8 columns Merge
# Column-wise concatenation: both parts are row-aligned by index,
# so inner join on axis=1 glues names to their attributes.
merge = pd.concat([p1data, p2data],axis=1, join="inner")
merge.shape
(398, 9)
# Persist the merged frame and reload it.
# Fix: the original wrote to an absolute, machine-specific path but read back
# a *relative* 'merge.csv' — that only works when the cwd happens to match.
# Use the same relative path for both operations.
merge.to_csv('merge.csv', index=False)
data1 = pd.read_csv('merge.csv')
data1.head(10)
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
| 5 | ford galaxie 500 | 15.0 | 8 | 429.0 | 198 | 4341 | 10.0 | 70 | 1 |
| 6 | chevrolet impala | 14.0 | 8 | 454.0 | 220 | 4354 | 9.0 | 70 | 1 |
| 7 | plymouth fury iii | 14.0 | 8 | 440.0 | 215 | 4312 | 8.5 | 70 | 1 |
| 8 | pontiac catalina | 14.0 | 8 | 455.0 | 225 | 4425 | 10.0 | 70 | 1 |
| 9 | amc ambassador dpl | 15.0 | 8 | 390.0 | 190 | 3850 | 8.5 | 70 | 1 |
# Keep only the numeric attributes for clustering/regression:
#dropping/ignoring car_name and origin (since origin is categorical data)
cData = data1.drop(['car_name','origin'], axis=1)
cData.head()
| mpg | cyl | disp | hp | wt | acc | yr | |
|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 |
cData.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 398 non-null float64 1 cyl 398 non-null int64 2 disp 398 non-null float64 3 hp 398 non-null object 4 wt 398 non-null int64 5 acc 398 non-null float64 6 yr 398 non-null int64 dtypes: float64(3), int64(3), object(1) memory usage: 21.9+ KB
In the given dataset, the hp column has object dtype, probably because it contains non-numerical values.
# isdigit()? on 'horsepower'
# Flag rows whose hp string is not purely digits (e.g. the '?' placeholder).
hpIsDigit = pd.DataFrame(cData.hp.str.isdigit()) # if the string is made of digits store True else False
#print isDigit = False!
cData[hpIsDigit['hp'] == False] # from temp take only those rows where hp has false
| mpg | cyl | disp | hp | wt | acc | yr | |
|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | ? | 2046 | 19.0 | 71 |
| 126 | 21.0 | 6 | 200.0 | ? | 2875 | 17.0 | 74 |
| 330 | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 |
| 336 | 23.6 | 4 | 140.0 | ? | 2905 | 14.3 | 80 |
| 354 | 34.5 | 4 | 100.0 | ? | 2320 | 15.8 | 81 |
| 374 | 23.0 | 4 | 151.0 | ? | 3035 | 20.5 | 82 |
# '?' marks missing horsepower; convert it to NaN so pandas can treat it as missing.
cData = cData.replace('?', np.nan)
cData[hpIsDigit['hp'] == False]  # the flagged rows now show NaN for hp
| mpg | cyl | disp | hp | wt | acc | yr | |
|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | NaN | 2046 | 19.0 | 71 |
| 126 | 21.0 | 6 | 200.0 | NaN | 2875 | 17.0 | 74 |
| 330 | 40.9 | 4 | 85.0 | NaN | 1835 | 17.3 | 80 |
| 336 | 23.6 | 4 | 140.0 | NaN | 2905 | 14.3 | 80 |
| 354 | 34.5 | 4 | 100.0 | NaN | 2320 | 15.8 | 81 |
| 374 | 23.0 | 4 | 151.0 | NaN | 3035 | 20.5 | 82 |
# Replace the missing values with the per-column median.
# Fix: convert 'hp' to a numeric dtype FIRST — taking the median of an
# object (string) column raises on recent pandas versions; errors='coerce'
# keeps the NaNs introduced for the former '?' placeholders.
cData['hp'] = pd.to_numeric(cData['hp'], errors='coerce')
cData = cData.fillna(cData.median())
cData.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 398 non-null float64 1 cyl 398 non-null int64 2 disp 398 non-null float64 3 hp 398 non-null float64 4 wt 398 non-null int64 5 acc 398 non-null float64 6 yr 398 non-null int64 dtypes: float64(4), int64(3) memory usage: 21.9 KB
# Boxplot of mpg grouped by origin; dashed red line marks the global mean mpg.
var = 'origin'
data_plt = pd.concat([data1['mpg'], data1[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="mpg", data=data_plt)
fig.axis(ymin=0, ymax=50)
plt.axhline(data1.mpg.mean(),color='r',linestyle='dashed',linewidth=2)
<matplotlib.lines.Line2D at 0x7fecf0678ca0>
In the plot above, the red line shows the global average of mpg. The majority of cars belonging to origin 1 has average mpg below global average mpg The majority of cars belonging to origin 2 and 3 has average mpg above global average mpg
# Boxplot of mpg grouped by cylinder count; dashed red line marks the global mean mpg.
var = 'cyl'
data_plt = pd.concat([data1['mpg'], data1[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="mpg", data=data_plt)
fig.axis(ymin=0, ymax=50)
plt.axhline(data1.mpg.mean(),color='r',linestyle='dashed',linewidth=2)
<matplotlib.lines.Line2D at 0x7fed108b5fd0>
The above plot shows that the majority of the cars having 4 or 5 cylinders have above-global-average mpg.
# Boxplot of mpg grouped by model year; dashed red line marks the global mean mpg.
var = 'yr'
data_plt = pd.concat([data1['mpg'], data1[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="mpg", data=data_plt)
fig.axis(ymin=0, ymax=50)
plt.axhline(data1.mpg.mean(),color='r',linestyle='dashed',linewidth=2)
<matplotlib.lines.Line2D at 0x7fece048f310>
The plot shows that the majority of the cars before year 79 have average mpg below the global average mpg, except for year 74 (where the average mpg is slightly above the global average mpg).
In the given dataset the car_name attribute contains Company name - Model name - Variant. These can be separated for further analysis.
data1.car_name.unique
<bound method Series.unique of 0 chevrolet chevelle malibu
1 buick skylark 320
2 plymouth satellite
3 amc rebel sst
4 ford torino
...
393 ford mustang gl
394 vw pickup
395 dodge rampage
396 ford ranger
397 chevy s-10
Name: car_name, Length: 398, dtype: object>
# Re-index the frame by car_name (the column itself is also kept).
data1.index =data1.car_name
data1.index
Index(['chevrolet chevelle malibu', 'buick skylark 320', 'plymouth satellite',
'amc rebel sst', 'ford torino', 'ford galaxie 500', 'chevrolet impala',
'plymouth fury iii', 'pontiac catalina', 'amc ambassador dpl',
...
'chrysler lebaron medallion', 'ford granada l', 'toyota celica gt',
'dodge charger 2.2', 'chevrolet camaro', 'ford mustang gl', 'vw pickup',
'dodge rampage', 'ford ranger', 'chevy s-10'],
dtype='object', name='car_name', length=398)
# NOTE(review): the original chained a .str.replace on the subaru rows but
# never assigned the result back, so it had no effect; the dead statement is
# dropped here rather than silently changing behavior by assigning it.
# Extract the first whitespace-delimited token of car_name as the maker.
# Fix: raw string avoids the invalid-escape warning for '\s'.
data1['Company_Name'] = data1.car_name.str.extract(r'(^.*?)\s')
# Normalise misspelled / abbreviated manufacturer names.
data1['Company_Name'] = data1['Company_Name'].replace(['volkswagen','vokswagen','vw'],'VW')
data1['Company_Name'] = data1['Company_Name'].replace('maxda','mazda')
data1['Company_Name'] = data1['Company_Name'].replace('toyouta','toyota')
data1['Company_Name'] = data1['Company_Name'].replace('mercedes','mercedes-benz')
data1['Company_Name'] = data1['Company_Name'].replace('nissan','datsun')
data1['Company_Name'] = data1['Company_Name'].replace('capri','ford')
data1['Company_Name'] = data1['Company_Name'].replace(['chevroelt','chevy'],'chevrolet')
data1.head()
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | Company_Name | |
|---|---|---|---|---|---|---|---|---|---|---|
| car_name | ||||||||||
| chevrolet chevelle malibu | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet |
| buick skylark 320 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | buick |
| plymouth satellite | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | plymouth |
| amc rebel sst | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | amc |
| ford torino | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | ford |
# Boxplot of mpg per manufacturer, x labels rotated for readability;
# dashed red line marks the global mean mpg.
var = 'Company_Name'
data_plt = pd.concat([data1['mpg'], data1[var]], axis=1)
f, ax = plt.subplots(figsize=(20,10))
fig = sns.boxplot(x=var, y="mpg", data=data_plt)
fig.set_xticklabels(ax.get_xticklabels(),rotation=30)
fig.axis(ymin=0, ymax=50)
plt.axhline(data1.mpg.mean(),color='r',linestyle='dashed',linewidth=2)
<matplotlib.lines.Line2D at 0x7fed032ddd60>
The above plot shows mpg distribution for the various car manufacturer
#scatterplot
sns.set()
# Fix: 'height' replaces the deprecated 'size' parameter — the original call
# emitted the seaborn UserWarning shown in the cell output.
sns.pairplot(data1, height=2.0, hue='origin')
plt.show()
/Users/Alok/opt/anaconda3/lib/python3.8/site-packages/seaborn/axisgrid.py:2071: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning) /Users/Alok/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation. warnings.warn(msg, UserWarning) /Users/Alok/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation. warnings.warn(msg, UserWarning)
# Pairwise correlations of the merged frame.
# NOTE(review): data1 contains non-numeric columns (car_name, Company_Name);
# older pandas silently drops them, pandas >= 2.0 raises here — would need
# corr(numeric_only=True) on newer versions. Confirm target pandas version.
corr = data1.corr()
# Plotting corelations using heatmap
plt.figure(figsize=(12,10))
plt.title("Correlation Plot")
sns.heatmap(corr, annot=True)
plt.show()
# Independent variables (all numeric attributes except the target).
X = cData.drop(['mpg'], axis=1)
# The dependent variable.
y = cData[['mpg']]
from scipy.stats import zscore
# Standardise each feature to zero mean / unit variance before clustering.
XScaled=X.apply(zscore)
XScaled.head()
| cyl | disp | hp | wt | acc | yr | |
|---|---|---|---|---|---|---|
| 0 | 1.498191 | 1.090604 | 0.673118 | 0.630870 | -1.295498 | -1.627426 |
| 1 | 1.498191 | 1.503514 | 1.589958 | 0.854333 | -1.477038 | -1.627426 |
| 2 | 1.498191 | 1.196232 | 1.197027 | 0.550470 | -1.658577 | -1.627426 |
| 3 | 1.498191 | 1.061796 | 1.197027 | 0.546923 | -1.295498 | -1.627426 |
| 4 | 1.498191 | 1.042591 | 0.935072 | 0.565841 | -1.840117 | -1.627426 |
#Finding optimal no. of clusters using the elbow (k-means) method
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

clusters = range(1, 10)
meanDistortions = []
for k in clusters:
    # Fix: seed each fit so the elbow curve is reproducible across runs.
    model = KMeans(n_clusters=k, random_state=1)
    model.fit(XScaled)
    prediction = model.predict(XScaled)
    # Mean distance of every point to its nearest centroid ("distortion").
    meanDistortions.append(
        sum(np.min(cdist(XScaled, model.cluster_centers_, 'euclidean'), axis=1))
        / XScaled.shape[0]
    )

plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
Text(0.5, 1.0, 'Selecting k with the Elbow Method')
The location of a knee in the plot is usually considered as an indicator of the appropriate number of clusters because it means that adding another cluster does not improve much better the partition. The above plot shows clear knee in elbow at k=2.
# Using K = 2 (the elbow point found above).
# Fix: explicit n_clusters and a fixed random_state so the 0/1 cluster
# labelling is reproducible between runs.
final_model = KMeans(n_clusters=2, random_state=1)
final_model.fit(XScaled)
prediction = final_model.predict(XScaled)
#Append the prediction to both the raw and the scaled frame
cData["GROUP"] = prediction
XScaled["GROUP"] = prediction
print("Groups Assigned : \n")
cData.head()
Groups Assigned :
| mpg | cyl | disp | hp | wt | acc | yr | GROUP | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 |
# Per-cluster means: profiles the two k-means groups on every attribute.
cDataClust = cData.groupby(['GROUP'])
cDataClust.mean()
| mpg | cyl | disp | hp | wt | acc | yr | |
|---|---|---|---|---|---|---|---|
| GROUP | |||||||
| 0 | 26.550340 | 4.568027 | 140.198980 | 85.112245 | 2569.47619 | 16.520408 | 76.768707 |
| 1 | 14.932692 | 7.961538 | 343.894231 | 158.557692 | 4103.87500 | 12.875962 | 73.865385 |
XScaled.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fecf0b50d30>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7fecf0b86280>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7fecf0ba9250>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7fed11698100>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x7fed116bdf70>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7fed03b04e20>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7fed03b0fdc0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7fed03b41c70>]],
dtype=object)
from sklearn.cluster import AgglomerativeClustering
# Hierarchical clustering with average linkage on the scaled features.
# NOTE(review): XScaled now contains the appended k-means 'GROUP' column,
# so it is used as a feature here — confirm this is intended.
# NOTE(review): 'affinity' was renamed to 'metric' in scikit-learn 1.2.
model = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='average')
model.fit(XScaled)
AgglomerativeClustering(linkage='average')
# Store the hierarchical-clustering assignment next to the k-means GROUP.
cData['labels'] = model.labels_
cData.head(10)
| mpg | cyl | disp | hp | wt | acc | yr | GROUP | labels | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | 0 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | 0 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | 0 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | 0 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | 0 |
| 5 | 15.0 | 8 | 429.0 | 198.0 | 4341 | 10.0 | 70 | 1 | 0 |
| 6 | 14.0 | 8 | 454.0 | 220.0 | 4354 | 9.0 | 70 | 1 | 0 |
| 7 | 14.0 | 8 | 440.0 | 215.0 | 4312 | 8.5 | 70 | 1 | 0 |
| 8 | 14.0 | 8 | 455.0 | 225.0 | 4425 | 10.0 | 70 | 1 | 0 |
| 9 | 15.0 | 8 | 390.0 | 190.0 | 3850 | 8.5 | 70 | 1 | 0 |
# Per-cluster means for the hierarchical labels (mirrors the k-means profile).
cDataClust = cData.groupby(['labels'])
cDataClust.mean()
| mpg | cyl | disp | hp | wt | acc | yr | GROUP | |
|---|---|---|---|---|---|---|---|---|
| labels | ||||||||
| 0 | 14.989423 | 7.980769 | 343.913462 | 158.365385 | 4108.278846 | 12.959615 | 73.942308 | 0.990385 |
| 1 | 26.530272 | 4.561224 | 140.192177 | 85.180272 | 2567.918367 | 16.490816 | 76.741497 | 0.003401 |
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist #Pairwise distribution between data points
# cophenet index is a measure of the correlation between the distance of points in feature space and distance on dendrogram
# closer it is to 1, the better is the clustering
# NOTE(review): XScaled still carries the appended 'GROUP' column here — it
# participates in the linkage distances; confirm this is intended.
Z = linkage(XScaled, metric='euclidean', method='average')
c, coph_dists = cophenet(Z , pdist(XScaled))
c
0.8244164715771007
# Draw the dendrogram; color_threshold=40 colours the two main branches.
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90.,color_threshold = 40, leaf_font_size=8. )
plt.tight_layout()
In the given data set K means method showed two optimal clusters in the given data set. While, using Hierarchical clustering method the dendrogram above shows that the data can be divided in 2 optimal clusters.
# 70/30 train-test split on the raw (unscaled) features; fixed seed.
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)
model = LinearRegression()
model.fit(x_train, y_train)
LinearRegression()
# Print each feature's fitted coefficient. y is a one-column DataFrame here,
# so coef_ has shape (1, n_features) and row 0 is indexed.
# (Loop body indentation restored — it was lost in the notebook export.)
for idx, col_name in enumerate(x_train.columns):
    print("The coefficient for {} is {}".format(col_name, model.coef_[0][idx]))
The coefficient for cyl is -0.18095805032306134 The coefficient for disp is 0.010983679987754645 The coefficient for hp is -0.00898274748809656 The coefficient for wt is -0.007188190332770612 The coefficient for acc is 0.02914290133876269 The coefficient for yr is 0.7883566858707713
# y was a one-column DataFrame, so intercept_ is a length-1 array.
intercept = model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
The intercept for our model is -15.621707993406599
model.score(x_test, y_test)
0.826804750114966
cDataClust.head()
| mpg | cyl | disp | hp | wt | acc | yr | GROUP | labels | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | 0 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | 0 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | 0 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | 0 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | 0 |
| 14 | 24.0 | 4 | 113.0 | 95.0 | 2372 | 15.0 | 70 | 0 | 1 |
| 15 | 22.0 | 6 | 198.0 | 95.0 | 2833 | 15.5 | 70 | 0 | 1 |
| 16 | 18.0 | 6 | 199.0 | 97.0 | 2774 | 15.5 | 70 | 0 | 1 |
| 17 | 21.0 | 6 | 200.0 | 85.0 | 2587 | 16.0 | 70 | 0 | 1 |
| 18 | 27.0 | 4 | 97.0 | 88.0 | 2130 | 14.5 | 70 | 0 | 1 |
# Splitting data according to the hierarchical clusters, then building
# per-cluster feature/target sets (cluster columns removed from X).
Cluster1=cData[cData['labels']==0]
Cluster2=cData[cData['labels']==1]
X_clust_1 = Cluster1.drop(['mpg', 'GROUP', 'labels'], axis=1)
X_clust_2 = Cluster2.drop(['mpg', 'GROUP', 'labels'], axis=1)
y_clust_1 = Cluster1['mpg']
y_clust_2 = Cluster2['mpg']
X_clust_1.shape
(104, 6)
# Separate linear regression for cluster 1 only (70/30 split, fixed seed).
x_train,x_test,y_train,y_test=train_test_split(X_clust_1,y_clust_1,test_size=0.3,random_state=1)
model = LinearRegression()
model.fit(x_train, y_train)
LinearRegression()
# y_clust_1 is a Series, so coef_ is 1-D here (unlike the whole-data model).
# (Loop body indentation restored — it was lost in the notebook export.)
for idx, col_name in enumerate(x_train.columns):
    print("The coefficient for {} is {}".format(col_name, model.coef_[idx]))
The coefficient for cyl is -0.6135274395339237 The coefficient for disp is 0.010536398528304375 The coefficient for hp is -0.023955113948680638 The coefficient for wt is -0.002263255788444855 The coefficient for acc is 0.04064114664778857 The coefficient for yr is 0.3340768878533721
# Series target -> scalar intercept_.
intercept = model.intercept_
print("The intercept for our model is {}".format(intercept))
The intercept for our model is 4.094177952246664
model.score(x_test, y_test)
0.6777863870629293
# Separate linear regression for cluster 2 only (70/30 split, fixed seed).
x_train,x_test,y_train,y_test=train_test_split(X_clust_2,y_clust_2,test_size=0.3, random_state=1)
model = LinearRegression()
model.fit(x_train, y_train)
LinearRegression()
# y_clust_2 is a Series, so coef_ is 1-D here.
# (Loop body indentation restored — it was lost in the notebook export.)
for idx, col_name in enumerate(x_train.columns):
    print("The coefficient for {} is {}".format(col_name, model.coef_[idx]))
The coefficient for cyl is 0.7654363604403668 The coefficient for disp is -0.026471386560153 The coefficient for hp is -0.06769965902885003 The coefficient for wt is -0.006286827751146289 The coefficient for acc is 0.11158002038064824 The coefficient for yr is 0.8285091971032668
# Series target -> scalar intercept_.
intercept = model.intercept_
print("The intercept for our model is {}".format(intercept))
The intercept for our model is -16.74978168728932
model.score(x_test, y_test)
0.7116026405407475
When the whole dataset is used as a single cluster, the linear regression model showed an R^2 of 82%. When the dataset is split into two clusters, the linear regression models showed 67% for cluster 1 and 71% for cluster 2. The decrease in accuracy may be due to fewer data points being available for training and testing when two separate clusters are used. To improve the prediction accuracy, more data points are needed for the linear regression model in each cluster.
#Loading dataset
# NOTE(review): the variable is named 'wine' but the file holds company
# quality data (attributes A-D plus a Quality label); the name is kept
# because all the cells below reference it.
wine = pd.read_excel('Part2Company.xlsx')
wine.shape
(61, 5)
wine.head()
| A | B | C | D | Quality | |
|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | Quality A |
| 1 | 174 | 133 | 134 | 166 | Quality B |
| 2 | 159 | 163 | 135 | 131 | NaN |
| 3 | 61 | 23 | 3 | 44 | Quality A |
| 4 | 59 | 60 | 9 | 68 | Quality A |
wine.isnull().sum()
A 0 B 0 C 0 D 0 Quality 18 dtype: int64
There are 18 missing values in the dataframe for Quality attribute
# Independent variables (A-D measurements).
X = wine.drop(['Quality'], axis=1)
# The dependent variable (partially missing: 18 NaNs).
y = wine[['Quality']]
# Scaling the dataset to zero mean / unit variance.
from scipy.stats import zscore
XScaled=X.apply(zscore)
XScaled.head()
| A | B | C | D | |
|---|---|---|---|---|
| 0 | -1.168034 | -1.561080 | -1.061569 | -0.103138 |
| 1 | 0.904992 | 0.284923 | 0.306077 | 0.823013 |
| 2 | 0.660147 | 0.807376 | 0.321443 | 0.264129 |
| 3 | -0.939512 | -1.630740 | -1.706975 | -1.125099 |
| 4 | -0.972158 | -0.986381 | -1.614775 | -0.741864 |
# Applying k-means with K = 2.
# Fix: a fixed random_state is essential here — without it the 0/1 cluster
# labels can swap between runs, which would flip the Quality A/B assignment
# made in the next cell.
final_model = KMeans(n_clusters=2, random_state=1)
final_model.fit(XScaled)
prediction = final_model.predict(XScaled)
#Append the prediction
wine["GROUP"] = prediction
XScaled["GROUP"] = prediction
print("Groups Assigned : \n")
wine.head()
Groups Assigned :
| A | B | C | D | Quality | GROUP | |
|---|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | Quality A | 0 |
| 1 | 174 | 133 | 134 | 166 | Quality B | 1 |
| 2 | 159 | 163 | 135 | 131 | NaN | 1 |
| 3 | 61 | 23 | 3 | 44 | Quality A | 0 |
| 4 | 59 | 60 | 9 | 68 | Quality A | 0 |
# Fill ONLY the missing Quality values from the cluster assignment.
# Fix: the original overwrote Quality for EVERY row, silently discarding
# the known labels; restrict the assignment to the 18 NaN rows.
# NOTE(review): mapping GROUP 0 -> 'Quality A' / 1 -> 'Quality B' matches the
# observed head() output — verify it still holds after any re-clustering.
missing = wine['Quality'].isnull()
wine.loc[missing & (wine['GROUP'] == 0), 'Quality'] = 'Quality A'
wine.loc[missing & (wine['GROUP'] == 1), 'Quality'] = 'Quality B'
wine.head()
| A | B | C | D | Quality | GROUP | |
|---|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | Quality A | 0 |
| 1 | 174 | 133 | 134 | 166 | Quality B | 1 |
| 2 | 159 | 163 | 135 | 131 | Quality B | 1 |
| 3 | 61 | 23 | 3 | 44 | Quality A | 0 |
| 4 | 59 | 60 | 9 | 68 | Quality A | 0 |
# Part 3: vehicle silhouette dataset (18 shape features + 'class' label).
df = pd.read_csv('Part3 - vehicle.csv')
df.shape
(846, 19)
The dataframe consists of 846 rows and 19 columns
df.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 841 non-null float64 2 distance_circularity 842 non-null float64 3 radius_ratio 840 non-null float64 4 pr.axis_aspect_ratio 844 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 845 non-null float64 7 elongatedness 845 non-null float64 8 pr.axis_rectangularity 843 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 843 non-null float64 11 scaled_variance.1 844 non-null float64 12 scaled_radius_of_gyration 844 non-null float64 13 scaled_radius_of_gyration.1 842 non-null float64 14 skewness_about 840 non-null float64 15 skewness_about.1 845 non-null float64 16 skewness_about.2 845 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
Most of the data in the given dataframe is numerical, except for the 'class' attribute, which shows the category of the vehicle.
#### Checking for null values
df.isnull().sum()
compactness 0 circularity 5 distance_circularity 4 radius_ratio 6 pr.axis_aspect_ratio 2 max.length_aspect_ratio 0 scatter_ratio 1 elongatedness 1 pr.axis_rectangularity 3 max.length_rectangularity 0 scaled_variance 3 scaled_variance.1 2 scaled_radius_of_gyration 2 scaled_radius_of_gyration.1 4 skewness_about 6 skewness_about.1 1 skewness_about.2 1 hollows_ratio 0 class 0 dtype: int64
## Five point summary
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| compactness | 846.0 | 93.678487 | 8.234474 | 73.0 | 87.00 | 93.0 | 100.0 | 119.0 |
| circularity | 841.0 | 44.828775 | 6.152172 | 33.0 | 40.00 | 44.0 | 49.0 | 59.0 |
| distance_circularity | 842.0 | 82.110451 | 15.778292 | 40.0 | 70.00 | 80.0 | 98.0 | 112.0 |
| radius_ratio | 840.0 | 168.888095 | 33.520198 | 104.0 | 141.00 | 167.0 | 195.0 | 333.0 |
| pr.axis_aspect_ratio | 844.0 | 61.678910 | 7.891463 | 47.0 | 57.00 | 61.0 | 65.0 | 138.0 |
| max.length_aspect_ratio | 846.0 | 8.567376 | 4.601217 | 2.0 | 7.00 | 8.0 | 10.0 | 55.0 |
| scatter_ratio | 845.0 | 168.901775 | 33.214848 | 112.0 | 147.00 | 157.0 | 198.0 | 265.0 |
| elongatedness | 845.0 | 40.933728 | 7.816186 | 26.0 | 33.00 | 43.0 | 46.0 | 61.0 |
| pr.axis_rectangularity | 843.0 | 20.582444 | 2.592933 | 17.0 | 19.00 | 20.0 | 23.0 | 29.0 |
| max.length_rectangularity | 846.0 | 147.998818 | 14.515652 | 118.0 | 137.00 | 146.0 | 159.0 | 188.0 |
| scaled_variance | 843.0 | 188.631079 | 31.411004 | 130.0 | 167.00 | 179.0 | 217.0 | 320.0 |
| scaled_variance.1 | 844.0 | 439.494076 | 176.666903 | 184.0 | 318.00 | 363.5 | 587.0 | 1018.0 |
| scaled_radius_of_gyration | 844.0 | 174.709716 | 32.584808 | 109.0 | 149.00 | 173.5 | 198.0 | 268.0 |
| scaled_radius_of_gyration.1 | 842.0 | 72.447743 | 7.486190 | 59.0 | 67.00 | 71.5 | 75.0 | 135.0 |
| skewness_about | 840.0 | 6.364286 | 4.920649 | 0.0 | 2.00 | 6.0 | 9.0 | 22.0 |
| skewness_about.1 | 845.0 | 12.602367 | 8.936081 | 0.0 | 5.00 | 11.0 | 19.0 | 41.0 |
| skewness_about.2 | 845.0 | 188.919527 | 6.155809 | 176.0 | 184.00 | 188.0 | 193.0 | 206.0 |
| hollows_ratio | 846.0 | 195.632388 | 7.438797 | 181.0 | 190.25 | 197.0 | 201.0 | 211.0 |
## Replacing missing values with the median values in the dataFrame
# Fix: numeric_only restricts the median to numeric columns — 'class' is a
# string column, and DataFrame.median over it raises on recent pandas.
df = df.fillna(df.median(numeric_only=True))
df.isnull().sum()
compactness 0 circularity 0 distance_circularity 0 radius_ratio 0 pr.axis_aspect_ratio 0 max.length_aspect_ratio 0 scatter_ratio 0 elongatedness 0 pr.axis_rectangularity 0 max.length_rectangularity 0 scaled_variance 0 scaled_variance.1 0 scaled_radius_of_gyration 0 scaled_radius_of_gyration.1 0 skewness_about 0 skewness_about.1 0 skewness_about.2 0 hollows_ratio 0 class 0 dtype: int64
## Plotting distribution of the target class
sns.countplot(df['class']);
Majority of the target class contains cars followed by bus and van. Almost 50% of the dataset contains cars
# Distribution plot for each of the 18 numeric features, laid out on a grid.
plt.figure(figsize=(20,30))
for plot_idx, feature in enumerate(df.columns[0:18], start=1):
    plt.subplot(5, 4, plot_idx)
    sns.distplot(df[feature])
plt.show()
1) In the given dataset 'compactness' and 'circularity' attirbute seems to be normally distributed.
2) Most of the attributes in the dataset are positively skewed.
#scatterplot
sns.set()
# Fix: 'height' replaces the deprecated 'size' parameter (same UserWarning
# as the earlier pairplot call).
sns.pairplot(df, height=2.0, hue='class')
plt.show()
# KDE of every feature split by vehicle class (car/bus/van overlaid).
plt.figure(figsize=(20,30))
for plot_idx, col in enumerate(df.columns[0:18], start=1):
    plt.subplot(5, 4, plot_idx)
    sns.kdeplot(df[df['class']=='car'][col],color='red',label='car',shade=True)
    sns.kdeplot(df[df['class']=='bus'][col],color='blue',label='bus',shade=True)
    sns.kdeplot(df[df['class']=='van'][col],color='yellow',label='van',shade=True)
    plt.title(col)
# Boxplot of every numeric feature (outlier inspection), one subplot each.
plt.figure(figsize=(20,30))
for plot_idx, col in enumerate(df.columns[0:18], start=1):
    plt.subplot(5, 4, plot_idx)
    sns.boxplot(y=df[col],hue='class', data=df)
1) The mean compactness, mean circularity and mean distance_circularity are highest for cars.
2) The compactness is least for vans, and the positively skewed data for buses indicates that a few buses have higher compactness.
3) Most of the other attributes have similar distributions for car, van and bus.
4) There are outliers in the 'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1', 'radius_ratio' and 'max.length_aspect_ratio' columns.
# Pairwise correlations of the vehicle features.
# NOTE(review): 'class' is non-numeric; older pandas drops it silently,
# pandas >= 2.0 would need corr(numeric_only=True).
corr = df.corr()
# Plotting corelations using heatmap
plt.figure(figsize=(12,10))
plt.title("Correlation Plot")
sns.heatmap(corr, annot=True)
plt.show()
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Separate the target from the features, then standardise every feature
# to zero mean / unit variance (z-scores) so PCA and SVM are not dominated
# by differing attribute scales.
y = df['class']
x = df.drop(['class'], axis=1)
XScaled = x.apply(zscore)
XScaled.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.160580 | 0.518073 | 0.057177 | 0.273363 | 1.310398 | 0.311542 | -0.207598 | 0.136262 | -0.224342 | 0.758332 | -0.401920 | -0.341934 | 0.285705 | -0.327326 | -0.073812 | 0.380870 | -0.312012 | 0.183957 |
| 1 | -0.325470 | -0.623732 | 0.120741 | -0.835032 | -0.593753 | 0.094079 | -0.599423 | 0.520519 | -0.610886 | -0.344578 | -0.593357 | -0.619724 | -0.513630 | -0.059384 | 0.538390 | 0.156798 | 0.013265 | 0.452977 |
| 2 | 1.254193 | 0.844303 | 1.519141 | 1.202018 | 0.548738 | 0.311542 | 1.148719 | -1.144597 | 0.935290 | 0.689401 | 1.097671 | 1.109379 | 1.392477 | 0.074587 | 1.558727 | -0.403383 | -0.149374 | 0.049447 |
| 3 | -0.082445 | -0.623732 | -0.006386 | -0.295813 | 0.167907 | 0.094079 | -0.750125 | 0.648605 | -0.610886 | -0.344578 | -0.912419 | -0.738777 | -1.466683 | -1.265121 | -0.073812 | -0.291347 | 1.639649 | 1.529056 |
| 4 | -1.054545 | -0.134387 | -0.769150 | 1.082192 | 5.245643 | 9.444962 | -0.599423 | 0.520519 | -0.610886 | -0.275646 | 1.671982 | -0.648070 | 0.408680 | 7.309005 | 0.538390 | -0.179311 | -1.450481 | -1.699181 |
# Hold out 30% of the standardised data for testing; fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    XScaled, y, test_size=0.3, random_state=1)
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)
(592, 18) (592,) (254, 18) (254,)
# Train a support-vector classifier (default RBF kernel) on the scaled features.
svm_model = SVC()
svm_model.fit(x_train, y_train)
SVC()
# Mean accuracy of the SVM on the held-out test split.
# (The original also computed an unused `predicted_labels` array; the
# classification report below recomputes predictions as `y_pred`, so the
# duplicate prediction call is dropped.)
svm_model.score(x_test, y_test)
0.952755905511811
# Per-class precision / recall / F1 for the SVM on the test split.
y_pred = svm_model.predict(x_test)
print(metrics.classification_report(y_test,y_pred))
precision recall f1-score support
bus 0.89 0.98 0.94 59
car 0.99 0.97 0.98 133
van 0.93 0.89 0.91 62
accuracy 0.95 254
macro avg 0.94 0.95 0.94 254
weighted avg 0.95 0.95 0.95 254
The model shows 95% accuracy using all the attributes given in the dataset
# Covariance matrix of the z-scored features. Rows of XScaled are
# observations, so variables go on the rows for np.cov — transposing is
# equivalent to passing rowvar=False.
covMatrix = np.cov(XScaled.T)
print(covMatrix)
[[ 1.00118343 0.68569786 0.79086299 0.69055952 0.09164265 0.14842463 0.81358214 -0.78968322 0.81465658 0.67694334 0.76297234 0.81497566 0.58593517 -0.24988794 0.23635777 0.15720044 0.29889034 0.36598446] [ 0.68569786 1.00118343 0.79325751 0.6216467 0.15396023 0.25176438 0.8489411 -0.82244387 0.84439802 0.96245572 0.79724837 0.83693508 0.92691166 0.05200785 0.14436828 -0.01145212 -0.10455005 0.04640562] [ 0.79086299 0.79325751 1.00118343 0.76794246 0.15864319 0.26499957 0.90614687 -0.9123854 0.89408198 0.77544391 0.86253904 0.88706577 0.70660663 -0.22621115 0.1140589 0.26586088 0.14627113 0.33312625] [ 0.69055952 0.6216467 0.76794246 1.00118343 0.66423242 0.45058426 0.73529816 -0.79041561 0.70922371 0.56962256 0.79435372 0.71928618 0.53700678 -0.18061084 0.04877032 0.17394649 0.38266622 0.47186659] [ 0.09164265 0.15396023 0.15864319 0.66423242 1.00118343 0.64949139 0.10385472 -0.18325156 0.07969786 0.1270594 0.27323306 0.08929427 0.12211524 0.15313091 -0.05843967 -0.0320139 0.24016968 0.26804208] [ 0.14842463 0.25176438 0.26499957 0.45058426 0.64949139 1.00118343 0.16638787 -0.18035326 0.16169312 0.30630475 0.31933428 0.1434227 0.18996732 0.29608463 0.01561769 0.04347324 -0.02611148 0.14408905] [ 0.81358214 0.8489411 0.90614687 0.73529816 0.10385472 0.16638787 1.00118343 -0.97275069 0.99092181 0.81004084 0.94978498 0.9941867 0.80082111 -0.02757446 0.07454578 0.21267959 0.00563439 0.1189581 ] [-0.78968322 -0.82244387 -0.9123854 -0.79041561 -0.18325156 -0.18035326 -0.97275069 1.00118343 -0.95011894 -0.77677186 -0.93748998 -0.95494487 -0.76722075 0.10342428 -0.05266193 -0.18527244 -0.11526213 -0.2171615 ] [ 0.81465658 0.84439802 0.89408198 0.70922371 0.07969786 0.16169312 0.99092181 -0.95011894 1.00118343 0.81189327 0.93533261 0.98938264 0.79763248 -0.01551372 0.08386628 0.21495454 -0.01867064 0.09940372] [ 0.67694334 0.96245572 0.77544391 0.56962256 0.1270594 0.30630475 0.81004084 -0.77677186 0.81189327 1.00118343 0.74586628 0.79555492 0.86747579 0.04167099 0.13601231 
0.00136727 -0.10407076 0.07686047] [ 0.76297234 0.79724837 0.86253904 0.79435372 0.27323306 0.31933428 0.94978498 -0.93748998 0.93533261 0.74586628 1.00118343 0.94679667 0.77983844 0.11321163 0.03677248 0.19446837 0.01423606 0.08579656] [ 0.81497566 0.83693508 0.88706577 0.71928618 0.08929427 0.1434227 0.9941867 -0.95494487 0.98938264 0.79555492 0.94679667 1.00118343 0.79595778 -0.01541878 0.07696823 0.20104818 0.00622636 0.10305714] [ 0.58593517 0.92691166 0.70660663 0.53700678 0.12211524 0.18996732 0.80082111 -0.76722075 0.79763248 0.86747579 0.77983844 0.79595778 1.00118343 0.19169941 0.16667971 -0.05621953 -0.22471583 -0.11814142] [-0.24988794 0.05200785 -0.22621115 -0.18061084 0.15313091 0.29608463 -0.02757446 0.10342428 -0.01551372 0.04167099 0.11321163 -0.01541878 0.19169941 1.00118343 -0.08846001 -0.12633227 -0.749751 -0.80307227] [ 0.23635777 0.14436828 0.1140589 0.04877032 -0.05843967 0.01561769 0.07454578 -0.05266193 0.08386628 0.13601231 0.03677248 0.07696823 0.16667971 -0.08846001 1.00118343 -0.03503155 0.1154338 0.09724079] [ 0.15720044 -0.01145212 0.26586088 0.17394649 -0.0320139 0.04347324 0.21267959 -0.18527244 0.21495454 0.00136727 0.19446837 0.20104818 -0.05621953 -0.12633227 -0.03503155 1.00118343 0.07740174 0.20523257] [ 0.29889034 -0.10455005 0.14627113 0.38266622 0.24016968 -0.02611148 0.00563439 -0.11526213 -0.01867064 -0.10407076 0.01423606 0.00622636 -0.22471583 -0.749751 0.1154338 0.07740174 1.00118343 0.89363767] [ 0.36598446 0.04640562 0.33312625 0.47186659 0.26804208 0.14408905 0.1189581 -0.2171615 0.09940372 0.07686047 0.08579656 0.10305714 -0.11814142 -0.80307227 0.09724079 0.20523257 0.89363767 1.00118343]]
# Fit a full PCA (all 18 components) on the standardised features.
pca = PCA()
pca.fit(XScaled)
PCA()
# Absolute variance (eigenvalue) captured by each principal component.
print(pca.explained_variance_)
[9.40460261e+00 3.01492206e+00 1.90352502e+00 1.17993747e+00 9.17260633e-01 5.39992629e-01 3.58870118e-01 2.21932456e-01 1.60608597e-01 9.18572234e-02 6.64994118e-02 4.66005994e-02 3.57947189e-02 2.74120657e-02 2.05792871e-02 1.79166314e-02 1.00257898e-02 2.96445743e-03]
# Eigenvectors: one row of attribute loadings per principal component.
print(pca.components_)
[[ 2.75283688e-01 2.93258469e-01 3.04609128e-01 2.67606877e-01 8.05039890e-02 9.72756855e-02 3.17092750e-01 -3.14133155e-01 3.13959064e-01 2.82830900e-01 3.09280359e-01 3.13788457e-01 2.72047492e-01 -2.08137692e-02 4.14555082e-02 5.82250207e-02 3.02795063e-02 7.41453913e-02] [-1.26953763e-01 1.25576727e-01 -7.29516436e-02 -1.89634378e-01 -1.22174860e-01 1.07482875e-02 4.81181371e-02 1.27498515e-02 5.99352482e-02 1.16220532e-01 6.22806229e-02 5.37843596e-02 2.09233172e-01 4.88525148e-01 -5.50899716e-02 -1.24085090e-01 -5.40914775e-01 -5.40354258e-01] [-1.19922479e-01 -2.48205467e-02 -5.60143254e-02 2.75074211e-01 6.42012966e-01 5.91801304e-01 -9.76283108e-02 5.76484384e-02 -1.09512416e-01 -1.70641987e-02 5.63239801e-02 -1.08840729e-01 -3.14636493e-02 2.86277015e-01 -1.15679354e-01 -7.52828901e-02 8.73592034e-03 3.95242743e-02] [ 7.83843562e-02 1.87337408e-01 -7.12008427e-02 -4.26053415e-02 3.27257119e-02 3.14147277e-02 -9.57485748e-02 8.22901952e-02 -9.24582989e-02 1.88005612e-01 -1.19844008e-01 -9.17449325e-02 2.00095228e-01 -6.55051354e-02 6.04794251e-01 -6.66114117e-01 1.05526253e-01 4.74890311e-02] [ 6.95178336e-02 -8.50649539e-02 4.06645651e-02 -4.61473714e-02 -4.05494487e-02 2.13432566e-01 -1.54853055e-02 7.68518712e-02 2.17633157e-03 -6.06366845e-02 -4.56472367e-04 -1.95548315e-02 -6.15991681e-02 1.45530146e-01 7.29189842e-01 5.99196401e-01 -1.00602332e-01 -2.98614819e-02] [ 1.44875476e-01 -3.02731148e-01 -1.38405773e-01 2.48136636e-01 2.36932611e-01 -4.19330747e-01 1.16100153e-01 -1.41840112e-01 9.80561329e-02 -4.61674972e-01 2.36225434e-01 1.57820194e-01 -1.35576278e-01 2.41356821e-01 2.03209257e-01 -1.91960802e-01 1.56939174e-01 -2.41222817e-01] [ 4.51862331e-01 -2.49103387e-01 7.40350569e-02 -1.76912814e-01 -3.97876601e-01 5.03413610e-01 6.49879382e-02 1.38112945e-02 9.66573058e-02 -1.04552173e-01 1.14622578e-01 8.37350220e-02 -3.73944382e-01 1.11952983e-01 -8.06328902e-02 -2.84558723e-01 1.81451818e-02 1.57237839e-02] [-5.66136785e-01 -1.79851809e-01 
4.34748988e-01 1.01998360e-01 -6.87147927e-02 1.61153097e-01 1.00688056e-01 -2.15497166e-01 6.35933915e-02 -2.49495867e-01 5.02096319e-02 4.37649907e-02 -1.08474496e-01 -3.40878491e-01 1.56487670e-01 -2.08774083e-01 -3.04580219e-01 -3.04186304e-02] [-4.84418105e-01 -1.41569001e-02 -1.67572478e-01 -2.30313563e-01 -2.77128307e-01 1.48032250e-01 5.44574214e-02 -1.56867362e-01 5.24978759e-03 -6.10362445e-02 2.97588112e-01 8.33669838e-02 2.41655483e-01 3.20221887e-01 2.21054148e-02 1.01761758e-02 5.17222779e-01 1.71506343e-01] [-2.60076393e-01 9.80779086e-02 -2.05031597e-01 -4.77888949e-02 1.08075009e-01 -1.18266345e-01 1.65167200e-01 -1.51612333e-01 1.93777917e-01 4.69059999e-01 -1.29986011e-01 1.58203940e-01 -6.86493700e-01 1.27648385e-01 9.83643219e-02 -3.55150608e-02 1.93956186e-02 6.41314778e-02] [ 4.65342885e-02 3.01323693e-03 7.06489498e-01 -1.07151583e-01 3.85169721e-02 -2.62254132e-01 -1.70405800e-01 -5.76632611e-02 -2.72514033e-01 1.41434233e-01 7.72596638e-02 -2.43226301e-01 -1.58888394e-01 4.19188664e-01 -1.25447648e-02 -3.27808069e-02 1.20597635e-01 9.19597847e-02] [ 1.20344026e-02 -2.13635088e-01 3.46330345e-04 -1.57049977e-01 1.10106595e-01 -1.32935328e-01 9.55883216e-02 1.22012715e-01 2.51281206e-01 -1.24529334e-01 -2.15011644e-01 1.75685051e-01 1.90336498e-01 2.85710601e-01 -1.60327156e-03 -8.32589542e-02 -3.53723696e-01 6.85618161e-01] [ 1.56136836e-01 1.50116709e-02 -2.37111452e-01 -3.07818692e-02 -3.92804479e-02 3.72884301e-02 3.94638419e-02 -8.10394855e-01 -2.71573184e-01 -7.57105808e-02 -1.53180808e-01 -3.07948154e-01 3.76087492e-02 4.34650674e-02 9.94304634e-03 2.68915150e-02 -1.86595152e-01 1.42380007e-01] [-6.00485194e-02 4.26993118e-01 -1.46240270e-01 5.21374718e-01 -3.63120360e-01 -6.27796802e-02 -6.40502241e-02 1.86946145e-01 -1.80912790e-01 -1.74070296e-01 2.77272123e-01 -7.85141734e-02 -2.00683948e-01 1.46861607e-01 1.73360301e-02 -3.13689218e-02 -2.31451048e-01 2.88502234e-01] [-9.67780251e-03 -5.97862837e-01 -1.57257142e-01 1.66551725e-01 
-6.36138719e-02 -8.63169844e-02 -7.98693109e-02 4.21515054e-02 -1.44490635e-01 5.11259153e-01 4.53236855e-01 -1.26992250e-01 1.09982525e-01 -1.11271959e-01 2.40943096e-02 -9.89651885e-03 -1.82212045e-01 9.04014702e-02] [-6.50956666e-02 -2.61244802e-01 7.82651714e-02 5.60792139e-01 -3.22276873e-01 4.87809642e-02 1.81839668e-02 -2.50330194e-02 1.64490784e-01 1.47280090e-01 -5.64444637e-01 -6.85856929e-02 1.47099233e-01 2.32941262e-01 -2.77589170e-02 2.78187408e-03 1.90629960e-01 -1.20966490e-01] [ 6.00532537e-03 -7.38059396e-02 2.50791236e-02 3.59880417e-02 -1.25847434e-02 2.84168792e-02 2.49652703e-01 4.21478467e-02 -7.17396292e-01 4.70233017e-02 -1.71503771e-01 6.16589383e-01 2.64910290e-02 1.42959461e-02 -1.74310271e-03 7.08894692e-03 -7.67874680e-03 -6.37681817e-03] [-1.00728764e-02 -9.15939674e-03 6.94599696e-03 -4.20156482e-02 3.12698087e-02 -9.99915816e-03 8.40975659e-01 2.38188639e-01 -1.01154594e-01 -1.69481636e-02 6.04665108e-03 -4.69202757e-01 1.17483082e-02 3.14812146e-03 -3.03156233e-03 -1.25315953e-02 4.34282436e-02 -6.47700819e-03]]
The percentage of variation explained by each eigenvector:
# Fraction of the total variance explained by each component.
print(pca.explained_variance_ratio_)
[5.21860337e-01 1.67297684e-01 1.05626388e-01 6.54745969e-02 5.08986889e-02 2.99641300e-02 1.99136623e-02 1.23150069e-02 8.91215289e-03 5.09714695e-03 3.69004485e-03 2.58586200e-03 1.98624491e-03 1.52109243e-03 1.14194232e-03 9.94191854e-04 5.56329946e-04 1.64497408e-04]
# Scree plot: variance ratio of each of the 18 components, then the
# cumulative variance as components are added (used to pick a cut-off).
component_ids = list(range(1, 19))
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('Eigen values')
plt.show()
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('cummulative of variation explained')
plt.xlabel('Eigen value')
plt.show()
In the above plots, the drop in variance with each additional principal component can be observed. Based on these plots, 8 principal components were selected, as they explain about 95% of the variance in the data.
# Keep only the first 8 principal components (chosen above as explaining
# ~95% of the variance) and project the scaled data onto them.
pca8 = PCA(n_components=8)
Xpca3 = pca8.fit_transform(XScaled)
print(pca8.components_)
print(pca8.explained_variance_ratio_)
[[ 2.75283688e-01 2.93258469e-01 3.04609128e-01 2.67606877e-01 8.05039890e-02 9.72756855e-02 3.17092750e-01 -3.14133155e-01 3.13959064e-01 2.82830900e-01 3.09280359e-01 3.13788457e-01 2.72047492e-01 -2.08137692e-02 4.14555082e-02 5.82250207e-02 3.02795063e-02 7.41453913e-02] [-1.26953763e-01 1.25576727e-01 -7.29516436e-02 -1.89634378e-01 -1.22174860e-01 1.07482875e-02 4.81181371e-02 1.27498515e-02 5.99352482e-02 1.16220532e-01 6.22806229e-02 5.37843596e-02 2.09233172e-01 4.88525148e-01 -5.50899716e-02 -1.24085090e-01 -5.40914775e-01 -5.40354258e-01] [-1.19922479e-01 -2.48205467e-02 -5.60143254e-02 2.75074211e-01 6.42012966e-01 5.91801304e-01 -9.76283108e-02 5.76484384e-02 -1.09512416e-01 -1.70641987e-02 5.63239801e-02 -1.08840729e-01 -3.14636493e-02 2.86277015e-01 -1.15679354e-01 -7.52828901e-02 8.73592034e-03 3.95242743e-02] [ 7.83843562e-02 1.87337408e-01 -7.12008427e-02 -4.26053415e-02 3.27257119e-02 3.14147277e-02 -9.57485748e-02 8.22901952e-02 -9.24582989e-02 1.88005612e-01 -1.19844008e-01 -9.17449325e-02 2.00095228e-01 -6.55051354e-02 6.04794251e-01 -6.66114117e-01 1.05526253e-01 4.74890311e-02] [ 6.95178336e-02 -8.50649539e-02 4.06645651e-02 -4.61473714e-02 -4.05494487e-02 2.13432566e-01 -1.54853055e-02 7.68518712e-02 2.17633157e-03 -6.06366845e-02 -4.56472367e-04 -1.95548315e-02 -6.15991681e-02 1.45530146e-01 7.29189842e-01 5.99196401e-01 -1.00602332e-01 -2.98614819e-02] [ 1.44875476e-01 -3.02731148e-01 -1.38405773e-01 2.48136636e-01 2.36932611e-01 -4.19330747e-01 1.16100153e-01 -1.41840112e-01 9.80561329e-02 -4.61674972e-01 2.36225434e-01 1.57820194e-01 -1.35576278e-01 2.41356821e-01 2.03209257e-01 -1.91960802e-01 1.56939174e-01 -2.41222817e-01] [ 4.51862331e-01 -2.49103387e-01 7.40350569e-02 -1.76912814e-01 -3.97876601e-01 5.03413610e-01 6.49879382e-02 1.38112945e-02 9.66573058e-02 -1.04552173e-01 1.14622578e-01 8.37350220e-02 -3.73944382e-01 1.11952983e-01 -8.06328902e-02 -2.84558723e-01 1.81451818e-02 1.57237839e-02] [-5.66136785e-01 -1.79851809e-01 
4.34748988e-01 1.01998360e-01 -6.87147927e-02 1.61153097e-01 1.00688056e-01 -2.15497166e-01 6.35933915e-02 -2.49495867e-01 5.02096319e-02 4.37649907e-02 -1.08474496e-01 -3.40878491e-01 1.56487670e-01 -2.08774083e-01 -3.04580219e-01 -3.04186304e-02]] [0.52186034 0.16729768 0.10562639 0.0654746 0.05089869 0.02996413 0.01991366 0.01231501]
# Display the PCA projection (one row per observation, 8 component columns).
Xpca3
array([[ 3.34162030e-01, -2.19026358e-01, 1.00158417e+00, ...,
-7.57446693e-01, -9.01124283e-01, -3.81106357e-01],
[-1.59171085e+00, -4.20602982e-01, -3.69033854e-01, ...,
-5.17161832e-01, 3.78636988e-01, 2.47058909e-01],
[ 3.76932418e+00, 1.95282752e-01, 8.78587404e-02, ...,
7.05041037e-01, -3.45837595e-02, 4.82771767e-01],
...,
[ 4.80917387e+00, -1.24931049e-03, 5.32333105e-01, ...,
-2.17069763e-01, 5.73248962e-01, 1.10477865e-01],
[-3.29409242e+00, -1.00827615e+00, -3.57003198e-01, ...,
-4.02491279e-01, -2.02405787e-01, 3.20621635e-01],
[-4.76505347e+00, 3.34899728e-01, -5.68136078e-01, ...,
-3.35637136e-01, 5.80978683e-02, -2.48034955e-01]])
# Pairplot of the 8 PCA components — the components should appear uncorrelated.
sns.pairplot(pd.DataFrame(Xpca3))
<seaborn.axisgrid.PairGrid at 0x7fed05d25a90>
# Split the PCA-projected data (same 70/30 ratio, different seed) and
# retrain + score the same SVM on the reduced feature set.
pca_x_train, pca_x_test, pca_y_train, pca_y_test = train_test_split(
    Xpca3, y, test_size=0.3, random_state=56)
svm_model.fit(pca_x_train, pca_y_train)
svm_model.score(pca_x_test, pca_y_test)
0.9409448818897638
# Per-class precision / recall / F1 on the PCA-reduced test split.
y_pred = svm_model.predict(pca_x_test)
print(metrics.classification_report(pca_y_test,y_pred))
precision recall f1-score support
bus 0.99 0.92 0.95 72
car 0.96 0.94 0.95 135
van 0.84 0.98 0.90 47
accuracy 0.94 254
macro avg 0.93 0.95 0.93 254
weighted avg 0.95 0.94 0.94 254
The model shows 94% accuracy while using 8 principal components.
To calculate classification results an SVM model has been implemented. PCA has been used to extract features by reducing the dimensionality from 18 to 8. By reducing the dimension of the feature space we have fewer relationships between variables to consider and are less likely to overfit the model; as an added benefit, each of the "new" variables after PCA is independent of the others, as seen in the pairplot. The accuracy of the results predicted by the SVM classifier using all 18 attributes is 95%, while the accuracy using the 8 PCA components is around 94%.
# Load the IPL batting statistics for the ranking exercise.
ipl = pd.read_csv('Part4_batting_bowling_ipl_bat.csv')
ipl.shape
(180, 7)
# Preview: every alternate row is entirely NaN (see the discussion below).
ipl.head()
| Name | Runs | Ave | SR | Fours | Sixes | HF | |
|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In the given data set every alternative row contains null value.
# Every even-indexed row is entirely NaN, so keep only the odd-indexed rows.
# FIX: .copy() materialises an independent frame — without it df stays a
# view on `ipl`, and the later `df['Ranking'] = ...` assignments raise
# SettingWithCopyWarning (visible in the original notebook output).
df = ipl.iloc[1::2].copy()
df.head()
| Name | Runs | Ave | SR | Fours | Sixes | HF | |
|---|---|---|---|---|---|---|---|
| 1 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 |
| 3 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 |
| 5 | V Sehwag | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 |
| 7 | CL White | 479.0 | 43.54 | 149.68 | 41.0 | 20.0 | 5.0 |
| 9 | S Dhawan | 569.0 | 40.64 | 129.61 | 58.0 | 18.0 | 5.0 |
# Confirm the remaining 90 rows have no nulls and the expected dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 90 entries, 1 to 179 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 90 non-null object 1 Runs 90 non-null float64 2 Ave 90 non-null float64 3 SR 90 non-null float64 4 Fours 90 non-null float64 5 Sixes 90 non-null float64 6 HF 90 non-null float64 dtypes: float64(6), object(1) memory usage: 5.1+ KB
# Numeric-only view: drop the categorical player-name column before scaling.
df_1 =df.drop(['Name'], axis=1)
df_1.head()
| Runs | Ave | SR | Fours | Sixes | HF | |
|---|---|---|---|---|---|---|
| 1 | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 |
| 3 | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 |
| 5 | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 |
| 7 | 479.0 | 43.54 | 149.68 | 41.0 | 20.0 | 5.0 |
| 9 | 569.0 | 40.64 | 129.61 | 58.0 | 18.0 | 5.0 |
# Sanity check: no missing values remain in the numeric columns.
df_1.isnull().sum()
Runs 0 Ave 0 SR 0 Fours 0 Sixes 0 HF 0 dtype: int64
# Distribution of each numeric attribute (the categorical Name column excluded).
col = ['Runs', 'Ave', 'SR', 'Fours', 'Sixes', 'HF']
plt.figure(figsize=(14,12))
for plot_idx, attribute in enumerate(col, start=1):
    # FIX: the original subplot index was i*(j+1)//i, which reduces to j+1 —
    # use the plain 1-based counter directly on a 3x3 grid.
    plt.subplot(3, 3, plot_idx)
    sns.distplot(df[attribute])
plt.show()
# Pairwise scatter matrix of the batting attributes.
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x7fed144f6730>
# Standardise the batting attributes to z-scores before running PCA
# (re-binds the XScaled name used earlier for the vehicle data).
XScaled=df_1.apply(zscore)
XScaled.head()
| Runs | Ave | SR | Fours | Sixes | HF | |
|---|---|---|---|---|---|---|
| 1 | 3.301945 | 2.683984 | 1.767325 | 1.607207 | 6.462679 | 4.651551 |
| 3 | 2.381639 | 0.896390 | 1.036605 | 2.710928 | 1.184173 | 2.865038 |
| 5 | 1.770248 | 0.610640 | 1.788154 | 2.281703 | 1.435530 | 2.269533 |
| 7 | 1.667276 | 1.388883 | 1.297182 | 1.300618 | 1.561209 | 2.269533 |
| 9 | 2.246490 | 1.174755 | 0.444038 | 2.343021 | 1.309851 | 2.269533 |
The data set contains 6 different attributes; hence, to rank the players, a dimensionality reduction technique can be used to select the most relevant features for ranking.
# Covariance matrix of the scaled batting attributes; transposing so that
# variables are on the rows is equivalent to rowvar=False.
covMatrix = np.cov(XScaled.T)
print(covMatrix)
[[1.01123596 0.70077082 0.49903347 0.9291323 0.77842677 0.84453142] [0.70077082 1.01123596 0.63061271 0.55234856 0.69008186 0.62772842] [0.49903347 0.63061271 1.01123596 0.38913406 0.59050396 0.43238784] [0.9291323 0.55234856 0.38913406 1.01123596 0.52844526 0.79249429] [0.77842677 0.69008186 0.59050396 0.52844526 1.01123596 0.77632221] [0.84453142 0.62772842 0.43238784 0.79249429 0.77632221 1.01123596]]
# Fit a full PCA (all 6 components) on the scaled batting attributes.
pca = PCA()
pca.fit(XScaled)
PCA()
# Eigenvalue (absolute variance) of each principal component.
print(pca.explained_variance_)
[4.30252561 0.83636692 0.41665751 0.32912443 0.16567829 0.01706297]
# Eigenvectors: loadings of the 6 batting attributes on each component.
print(pca.components_)
[[ 0.4582608 0.39797313 0.3253838 0.40574167 0.41733459 0.43237178] [ 0.26643209 -0.33111756 -0.69780334 0.47355804 -0.17902455 0.27593225] [-0.10977942 0.00550486 -0.45013448 -0.50823538 0.66942589 0.28082541] [-0.00520142 0.84736307 -0.43275029 -0.03252305 -0.24878157 -0.17811777] [ 0.45840889 -0.10122837 -0.11890348 0.09676885 0.39458014 -0.77486668] [ 0.70483594 -0.0606373 0.05624934 -0.58514214 -0.35786211 0.16096217]]
The percentage of variation explained by each eigenvector:
# Fraction of total variance explained by each component.
print(pca.explained_variance_ratio_)
[0.70911996 0.13784566 0.06867133 0.05424458 0.02730624 0.00281223]
# Scree plot for the 6 batting components, followed by the cumulative
# variance curve used to choose how many components to keep.
component_ids = list(range(1, 7))
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
From the above plot it can be seen that 90% of the variance in the data can be explained by the 'Runs', 'Ave' (average) and 'SR' (strike rate) attributes. Hence, these attributes are used for ranking the players.
# Rank players by total runs (highest runs = rank 1) and show the top 5.
# FIX: the original chained assignment + inplace sort on a sliced view of
# `ipl` raised SettingWithCopyWarning; assign() plus a non-inplace
# sort_values() builds a fresh frame and produces the same result cleanly.
df = df.assign(Ranking=df['Runs'].rank(ascending=False)).sort_values('Ranking')
df.head()
<ipython-input-473-0f1bf4c2104f>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['Ranking'] = df['Runs'].rank(ascending = False)
<ipython-input-473-0f1bf4c2104f>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df.sort_values("Ranking", inplace = True)
| Name | Runs | Ave | SR | Fours | Sixes | HF | Ranking | |
|---|---|---|---|---|---|---|---|---|
| 1 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 | 1.0 |
| 3 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 | 2.0 |
| 9 | S Dhawan | 569.0 | 40.64 | 129.61 | 58.0 | 18.0 | 5.0 | 3.0 |
| 11 | AM Rahane | 560.0 | 40.00 | 129.33 | 73.0 | 10.0 | 5.0 | 4.0 |
| 5 | V Sehwag | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 | 5.0 |
# Rank players by batting average (highest average = rank 1), top 5 shown.
# FIX: avoids SettingWithCopyWarning by building a new frame with assign()
# and a non-inplace sort instead of mutating the sliced view in place.
df = df.assign(Ranking=df['Ave'].rank(ascending=False)).sort_values('Ranking')
df.head()
<ipython-input-474-b004de99e331>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['Ranking'] = df['Ave'].rank(ascending = False)
<ipython-input-474-b004de99e331>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df.sort_values("Ranking", inplace = True)
| Name | Runs | Ave | SR | Fours | Sixes | HF | Ranking | |
|---|---|---|---|---|---|---|---|---|
| 19 | JP Duminy | 244.0 | 81.33 | 128.42 | 13.0 | 11.0 | 2.0 | 1.0 |
| 1 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 | 2.0 |
| 13 | KP Pietersen | 305.0 | 61.00 | 147.34 | 22.0 | 20.0 | 3.0 | 3.0 |
| 29 | DJ Bravo | 371.0 | 46.37 | 140.53 | 20.0 | 20.0 | 0.0 | 4.0 |
| 7 | CL White | 479.0 | 43.54 | 149.68 | 41.0 | 20.0 | 5.0 | 5.0 |
# Rank players by strike rate (highest SR = rank 1), top 5 shown.
# FIX: avoids SettingWithCopyWarning by building a new frame with assign()
# and a non-inplace sort instead of mutating the sliced view in place.
df = df.assign(Ranking=df['SR'].rank(ascending=False)).sort_values('Ranking')
df.head()
<ipython-input-475-39e033a44e72>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['Ranking'] = df['SR'].rank(ascending = False)
<ipython-input-475-39e033a44e72>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df.sort_values("Ranking", inplace = True)
| Name | Runs | Ave | SR | Fours | Sixes | HF | Ranking | |
|---|---|---|---|---|---|---|---|---|
| 21 | DA Warner | 256.0 | 36.57 | 164.10 | 28.0 | 14.0 | 3.0 | 1.0 |
| 5 | V Sehwag | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 | 2.0 |
| 17 | AB de Villiers | 319.0 | 39.87 | 161.11 | 26.0 | 15.0 | 3.0 | 3.0 |
| 1 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 | 4.0 |
| 41 | DR Smith | 157.0 | 39.25 | 160.20 | 18.0 | 7.0 | 1.0 | 5.0 |
The most commonly implemented Dimensionality Reduction techniques in Python are:
1) Missing Value Ratio
2) Low Variance Filter
3) High Correlation Filter
4) Random Forest
5) Backward Feature Elimination
6) Forward Feature Selection
7) Factor Analysis
8) Principal Component Analysis
9) Independent Component Analysis
10) Methods Based on Projections
11) t-Distributed Stochastic Neighbor Embedding (t-SNE)
12) UMAP